In [1]:
import sys

print(f"Interpreter dir: {sys.executable}")
import os

if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("../")

print(f"Working dir: {os.getcwd()}")
%load_ext autoreload
%autoreload 2
Interpreter dir: /Users/jsg/Documents/Stor/PdM_mockup/.venv_storpdm/bin/python3.7
Working dir: /Users/jsg/Documents/Stor/PdM_mockup
In [2]:
# General
from pathlib import Path

# Data & modeling
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 500)
import lightgbm as lgb
from scipy.stats import randint, uniform
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, f1_score
from sklearn.neural_network import MLPRegressor


# PLotting
import matplotlib.pyplot as plt
import plotly

# plotly.offline.init_notebook_mode(connected=True)

# Custom package
from storpdm import DATA_PATH
from storpdm.make_dataset import download_dataset, import_dataset
from storpdm.visualize import (
    visualise_sensor_correlation_all_engine,
    visualise_sensor_data_distribution,
    plot_time_history_all_engines,
)
from storpdm.build_features import (
    find_correlated_data,
    list_correlated_data,
    find_time_independent_columns,
    add_calculated_rul,
    prepare_training_data,
)
from storpdm.train_model import EstimatorSelectionHelper

Problem definition and experimental scenario

Data sets consists of multiple multivariate time series. Each data set is further divided into training and test subsets. Each time series is from a different engine, i.e., the data can be considered to be from a fleet of engines of the same type. Each engine starts with different degrees of initial wear and manufacturing variation which is unknown to the user. This wear and variation is considered normal, i.e., it is not considered a fault condition. There are three operational settings that have a substantial effect on engine performance. These settings are also included in the data. The data is contaminated with sensor noise.

The engine is operating normally at the start of each time series, and develops a fault at some point during the series: the fault grows in magnitude until system failure.

The aim is to model and predict the Remaining Time of Failure (RUL) as accurately as possible. In a practical setup, such model can be used for:

  • root-cause analysos
  • Real-time monitoring

Modeling process

The general modeling outline is described in the diagram below and is as follows:

  1. Load data
  2. Pre-process data
    • Transform
    • Visualization
  3. Model training and selection
  4. Using the best modeling approach we do:
    • Predictions
    • Root-cause analysis

In this notebook we model the failures from two different points of view

  • How many cycles are left until the engine fails?
  • Will the engine fail in less than 10 cycles? What is the probability?
In [3]:
from IPython.display import SVG
SVG(filename='reports/data flow.svg')
Out[3]:

Download raw data, unzip and load to memory

In [4]:
## Run the download function only once
if len(list(Path('data/raw').glob('train_*.txt'))) == 4:
    print('Raw data has been downloaded')
else:
    download_dataset()
Raw data has been downloaded
In [5]:
# Load datasets
df_rul, df_train, df_test = import_dataset(filename="FD001")
display(df_train)
id cycle op1 op2 op3 FanInletTemp LPCOutletTemp HPCOutletTemp LPTOutletTemp FanInletPres BypassDuctPres TotalHPCOutletPres PhysFanSpeed PhysCoreSpeed EnginePresRatio StaticHPCOutletPres FuelFlowRatio CorrFanSpeed CorrCoreSpeed BypassRatio BurnerFuelAirRatio BleedEnthalpy DemandFanSpeed DemandCorrFanSpeed HPTCoolantBleed LPTCoolantBleed
0 1 1 -0.0007 -0.0004 100.0 518.67 641.82 1589.70 1400.60 14.62 21.61 554.36 2388.06 9046.19 1.3 47.47 521.66 2388.02 8138.62 8.4195 0.03 392 2388 100.0 39.06 23.4190
1 1 2 0.0019 -0.0003 100.0 518.67 642.15 1591.82 1403.14 14.62 21.61 553.75 2388.04 9044.07 1.3 47.49 522.28 2388.07 8131.49 8.4318 0.03 392 2388 100.0 39.00 23.4236
2 1 3 -0.0043 0.0003 100.0 518.67 642.35 1587.99 1404.20 14.62 21.61 554.26 2388.08 9052.94 1.3 47.27 522.42 2388.03 8133.23 8.4178 0.03 390 2388 100.0 38.95 23.3442
3 1 4 0.0007 0.0000 100.0 518.67 642.35 1582.79 1401.87 14.62 21.61 554.45 2388.11 9049.48 1.3 47.13 522.86 2388.08 8133.83 8.3682 0.03 392 2388 100.0 38.88 23.3739
4 1 5 -0.0019 -0.0002 100.0 518.67 642.37 1582.85 1406.22 14.62 21.61 554.00 2388.06 9055.15 1.3 47.28 522.19 2388.04 8133.80 8.4294 0.03 393 2388 100.0 38.90 23.4044
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
20626 100 196 -0.0004 -0.0003 100.0 518.67 643.49 1597.98 1428.63 14.62 21.61 551.43 2388.19 9065.52 1.3 48.07 519.49 2388.26 8137.60 8.4956 0.03 397 2388 100.0 38.49 22.9735
20627 100 197 -0.0016 -0.0005 100.0 518.67 643.54 1604.50 1433.58 14.62 21.61 550.86 2388.23 9065.11 1.3 48.04 519.68 2388.22 8136.50 8.5139 0.03 395 2388 100.0 38.30 23.1594
20628 100 198 0.0004 0.0000 100.0 518.67 643.42 1602.46 1428.18 14.62 21.61 550.94 2388.24 9065.90 1.3 48.09 520.01 2388.24 8141.05 8.5646 0.03 398 2388 100.0 38.44 22.9333
20629 100 199 -0.0011 0.0003 100.0 518.67 643.23 1605.26 1426.53 14.62 21.61 550.68 2388.25 9073.72 1.3 48.39 519.67 2388.23 8139.29 8.5389 0.03 395 2388 100.0 38.29 23.0640
20630 100 200 -0.0032 -0.0005 100.0 518.67 643.85 1600.38 1432.14 14.62 21.61 550.79 2388.26 9061.48 1.3 48.20 519.30 2388.26 8137.33 8.5036 0.03 396 2388 100.0 38.37 23.0522

20631 rows × 26 columns

Pre-process data and visualize

  • Remove correlated sensors
  • Constant data with time
  • ...
In [6]:
plt.figure(figsize=(15, 15))
visualise_sensor_correlation_all_engine(df_train)
Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fb39738e4d0>
In [7]:
# Reduce / Eliminate highly-correlated sensors.
correlation_threshold = 0.95
correlated_data = find_correlated_data(df_train, correlation_threshold)
columns_to_be_removed = list_correlated_data(correlated_data)
In [8]:
print(f"Removing {columns_to_be_removed} from columns")
df_train_proc = df_train.drop(columns_to_be_removed, axis=1)
Removing ['FanInletPres', 'EnginePresRatio', 'BurnerFuelAirRatio', 'CorrCoreSpeed'] from columns
In [9]:
# Visualise correlations, extreme ones should have been removed
plt.figure(figsize=(15, 15))
visualise_sensor_correlation_all_engine(df_train_proc)
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fb397570750>
In [10]:
plt.figure(figsize=(15, 15))
fig = visualise_sensor_data_distribution(df_train_proc)
/Users/jsg/Documents/Stor/PdM_mockup/.venv_storpdm/lib/python3.7/site-packages/seaborn/distributions.py:283: UserWarning: Data must have variance to compute a kernel density estimate.
  warnings.warn(msg, UserWarning)
/Users/jsg/Documents/Stor/PdM_mockup/.venv_storpdm/lib/python3.7/site-packages/seaborn/distributions.py:283: UserWarning: Data must have variance to compute a kernel density estimate.
  warnings.warn(msg, UserWarning)
/Users/jsg/Documents/Stor/PdM_mockup/.venv_storpdm/lib/python3.7/site-packages/seaborn/distributions.py:283: UserWarning: Data must have variance to compute a kernel density estimate.
  warnings.warn(msg, UserWarning)
/Users/jsg/Documents/Stor/PdM_mockup/.venv_storpdm/lib/python3.7/site-packages/seaborn/distributions.py:283: UserWarning: Data must have variance to compute a kernel density estimate.
  warnings.warn(msg, UserWarning)
<Figure size 1080x1080 with 0 Axes>
In [11]:
# Remove data that does not change with time.
time_independent_columns = find_time_independent_columns(df_train_proc)
print(f"Removing constant columns: {time_independent_columns}")

df_train_proc2 = df_train_proc.drop(time_independent_columns, axis=1)
display(df_train_proc2)
Removing constant columns: ['op3', 'FanInletTemp', 'BypassDuctPres', 'PhysFanSpeed', 'CorrFanSpeed', 'DemandFanSpeed', 'DemandCorrFanSpeed']
id cycle op1 op2 LPCOutletTemp HPCOutletTemp LPTOutletTemp TotalHPCOutletPres PhysCoreSpeed StaticHPCOutletPres FuelFlowRatio BypassRatio BleedEnthalpy HPTCoolantBleed LPTCoolantBleed
0 1 1 -0.0007 -0.0004 641.82 1589.70 1400.60 554.36 9046.19 47.47 521.66 8.4195 392 39.06 23.4190
1 1 2 0.0019 -0.0003 642.15 1591.82 1403.14 553.75 9044.07 47.49 522.28 8.4318 392 39.00 23.4236
2 1 3 -0.0043 0.0003 642.35 1587.99 1404.20 554.26 9052.94 47.27 522.42 8.4178 390 38.95 23.3442
3 1 4 0.0007 0.0000 642.35 1582.79 1401.87 554.45 9049.48 47.13 522.86 8.3682 392 38.88 23.3739
4 1 5 -0.0019 -0.0002 642.37 1582.85 1406.22 554.00 9055.15 47.28 522.19 8.4294 393 38.90 23.4044
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
20626 100 196 -0.0004 -0.0003 643.49 1597.98 1428.63 551.43 9065.52 48.07 519.49 8.4956 397 38.49 22.9735
20627 100 197 -0.0016 -0.0005 643.54 1604.50 1433.58 550.86 9065.11 48.04 519.68 8.5139 395 38.30 23.1594
20628 100 198 0.0004 0.0000 643.42 1602.46 1428.18 550.94 9065.90 48.09 520.01 8.5646 398 38.44 22.9333
20629 100 199 -0.0011 0.0003 643.23 1605.26 1426.53 550.68 9073.72 48.39 519.67 8.5389 395 38.29 23.0640
20630 100 200 -0.0032 -0.0005 643.85 1600.38 1432.14 550.79 9061.48 48.20 519.30 8.5036 396 38.37 23.0522

20631 rows × 15 columns

In [12]:
# Add Remaining Useful Life (RUL) to dataset.
df_train_proc2 = add_calculated_rul(df_train_proc2)

# Visualise sensor behaviour against RUL.
plt.figure(figsize=(35, 17))
fig = plot_time_history_all_engines(df_train_proc2)
<Figure size 2520x1224 with 0 Axes>
In [13]:
from storpdm.visualize import interactive_rul_series
fig = interactive_rul_series(df_train_proc2, filename='reports/series.html')

Find best model and train

We perform a comparison of 3 different models with several combinations of hyperparameters and automatically select the best one

In [14]:
# Display processed dataset
df_train_proc2
Out[14]:
id cycle op1 op2 LPCOutletTemp HPCOutletTemp LPTOutletTemp TotalHPCOutletPres PhysCoreSpeed StaticHPCOutletPres FuelFlowRatio BypassRatio BleedEnthalpy HPTCoolantBleed LPTCoolantBleed RUL
0 1 1 -0.0007 -0.0004 641.82 1589.70 1400.60 554.36 9046.19 47.47 521.66 8.4195 392 39.06 23.4190 191
1 1 2 0.0019 -0.0003 642.15 1591.82 1403.14 553.75 9044.07 47.49 522.28 8.4318 392 39.00 23.4236 190
2 1 3 -0.0043 0.0003 642.35 1587.99 1404.20 554.26 9052.94 47.27 522.42 8.4178 390 38.95 23.3442 189
3 1 4 0.0007 0.0000 642.35 1582.79 1401.87 554.45 9049.48 47.13 522.86 8.3682 392 38.88 23.3739 188
4 1 5 -0.0019 -0.0002 642.37 1582.85 1406.22 554.00 9055.15 47.28 522.19 8.4294 393 38.90 23.4044 187
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
20626 100 196 -0.0004 -0.0003 643.49 1597.98 1428.63 551.43 9065.52 48.07 519.49 8.4956 397 38.49 22.9735 4
20627 100 197 -0.0016 -0.0005 643.54 1604.50 1433.58 550.86 9065.11 48.04 519.68 8.5139 395 38.30 23.1594 3
20628 100 198 0.0004 0.0000 643.42 1602.46 1428.18 550.94 9065.90 48.09 520.01 8.5646 398 38.44 22.9333 2
20629 100 199 -0.0011 0.0003 643.23 1605.26 1426.53 550.68 9073.72 48.39 519.67 8.5389 395 38.29 23.0640 1
20630 100 200 -0.0032 -0.0005 643.85 1600.38 1432.14 550.79 9061.48 48.20 519.30 8.5036 396 38.37 23.0522 0

20631 rows × 16 columns

In [48]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler

# Define models
models = {
    #"rf": RandomForestRegressor(), # 3 minutes
    "lgb": lgb.LGBMRegressor(),
    "linear": Ridge(normalize = True),
    "mlp": Pipeline([("std",MinMaxScaler()),
                       ("m1",MLPRegressor(early_stopping=True))])
}

# Define hyperparam search space
params = {
    #"rf": {"n_estimators": [100, 500]},
    "lgb": {
        "n_estimators": [100, 500],
        "learning_rate": [0.005, 0.01],
        "max_depth":[5,10,15,20]
    },
    "mlp": {
        "m1__hidden_layer_sizes": [(128),(64, 32)],
        "m1__learning_rate_init": [0.001,0.005,0.01],
    },
    "linear": {},
}
In [49]:
X_train, X_test, y_train, y_test = prepare_training_data(
    df=df_train_proc2, target_col="RUL", discard_cols="id"
)
In [50]:
%%time
estim_grid = EstimatorSelectionHelper(models, params)
estim_grid.fit(X_train, y_train, scoring="neg_root_mean_squared_error", n_jobs=2, cv=None)
Running GridSearchCV for lgb.
Fitting 5 folds for each of 16 candidates, totalling 80 fits
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   22.3s
[Parallel(n_jobs=2)]: Done  80 out of  80 | elapsed:   36.5s finished
Running GridSearchCV for linear.
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Running GridSearchCV for mlp.
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  30 out of  30 | elapsed:  2.2min finished
CPU times: user 28.7 s, sys: 3.79 s, total: 32.5 s
Wall time: 2min 59s
In [51]:
estim_grid.score_summary()
Out[51]:
mean_fit_time std_fit_time mean_score_time std_score_time param_learning_rate param_max_depth param_n_estimators params split0_test_score split1_test_score split2_test_score split3_test_score split4_test_score mean_test_score std_test_score rank_test_score split0_train_score split1_train_score split2_train_score split3_train_score split4_train_score mean_train_score std_train_score model param_m1__hidden_layer_sizes param_m1__learning_rate_init
9 0.883810 0.059058 0.049313 0.008101 0.01 5 500 {'learning_rate': 0.01, 'max_depth': 5, 'n_est... -36.428016 -36.120235 -35.755330 -37.375420 -35.867215 -36.309243 0.581167 1 -33.749205 -33.816692 -33.800639 -33.481610 -33.779968 -33.725623 0.124075 lgb NaN NaN
11 1.071303 0.053159 0.047066 0.001716 0.01 10 500 {'learning_rate': 0.01, 'max_depth': 10, 'n_es... -36.367914 -36.144405 -35.845607 -37.379432 -35.902289 -36.327930 0.557628 2 -32.574781 -32.621698 -32.626584 -32.333547 -32.617511 -32.554824 0.112172 lgb NaN NaN
2 7.551598 1.055019 0.009535 0.000320 NaN NaN NaN {'m1__hidden_layer_sizes': 128, 'm1__learning_... -36.360788 -36.077179 -35.775295 -37.470710 -36.039053 -36.344605 0.592852 1 -36.261756 -36.220126 -36.407520 -35.962082 -36.337651 -36.237827 0.152107 mlp 128 0.01
4 6.255944 1.261816 0.008779 0.001424 NaN NaN NaN {'m1__hidden_layer_sizes': (64, 32), 'm1__lear... -36.345590 -36.005377 -35.854759 -37.664503 -35.853802 -36.344806 0.683819 2 -36.231544 -36.205384 -36.480643 -36.131466 -36.068617 -36.223531 0.140694 mlp (64, 32) 0.005
15 1.037422 0.119120 0.045413 0.004865 0.01 20 500 {'learning_rate': 0.01, 'max_depth': 20, 'n_es... -36.385475 -36.176642 -35.882981 -37.382386 -35.908253 -36.347147 0.549607 3 -32.445850 -32.537719 -32.568102 -32.201763 -32.551859 -32.461059 0.136415 lgb NaN NaN
13 1.033619 0.044648 0.047791 0.003105 0.01 15 500 {'learning_rate': 0.01, 'max_depth': 15, 'n_es... -36.399120 -36.160139 -35.892506 -37.380706 -35.931588 -36.352812 0.545004 4 -32.472640 -32.540541 -32.565700 -32.212848 -32.548608 -32.468068 0.131476 lgb NaN NaN
5 4.338333 0.798325 0.008215 0.001037 NaN NaN NaN {'m1__hidden_layer_sizes': (64, 32), 'm1__lear... -36.398610 -36.131074 -35.816263 -37.524760 -35.989189 -36.371979 0.607124 3 -36.340117 -36.280198 -36.444013 -36.029225 -36.336043 -36.285919 0.138829 mlp (64, 32) 0.01
3 14.174047 0.942447 0.007978 0.000496 NaN NaN NaN {'m1__hidden_layer_sizes': (64, 32), 'm1__lear... -36.458043 -36.239246 -35.936738 -37.508838 -35.885124 -36.405598 0.589624 4 -36.359851 -36.395903 -36.528664 -36.016178 -36.184544 -36.297028 0.178274 mlp (64, 32) 0.001
1 9.720873 2.161609 0.009404 0.000973 NaN NaN NaN {'m1__hidden_layer_sizes': 128, 'm1__learning_... -36.381584 -36.258647 -35.940683 -37.415924 -36.100396 -36.419447 0.519859 5 -36.283694 -36.432654 -36.498956 -35.956160 -36.384205 -36.311134 0.190863 mlp 128 0.005
1 1.006555 0.078302 0.037506 0.001589 0.005 5 500 {'learning_rate': 0.005, 'max_depth': 5, 'n_es... -36.818092 -36.567539 -36.037339 -37.936022 -36.331417 -36.738082 0.652221 5 -35.131713 -35.179080 -35.244325 -34.847641 -35.240741 -35.128700 0.146604 lgb NaN NaN
3 1.297827 0.217387 0.039087 0.002211 0.005 10 500 {'learning_rate': 0.005, 'max_depth': 10, 'n_e... -36.815244 -36.576467 -36.084477 -37.936887 -36.332804 -36.749176 0.641879 6 -34.692531 -34.734335 -34.798674 -34.437189 -34.784762 -34.689498 0.131662 lgb NaN NaN
7 1.555958 0.349995 0.042898 0.004615 0.005 20 500 {'learning_rate': 0.005, 'max_depth': 20, 'n_e... -36.822226 -36.571788 -36.091371 -37.949011 -36.335293 -36.753938 0.645027 7 -34.687369 -34.724948 -34.785039 -34.428642 -34.781105 -34.681421 0.131530 lgb NaN NaN
5 1.935861 0.357293 0.047829 0.005605 0.005 15 500 {'learning_rate': 0.005, 'max_depth': 15, 'n_e... -36.822226 -36.572308 -36.091343 -37.950412 -36.335293 -36.754316 0.645523 8 -34.687369 -34.725029 -34.784764 -34.428601 -34.781105 -34.681374 0.131508 lgb NaN NaN
0 11.293885 2.788911 0.011116 0.001976 NaN NaN NaN {'m1__hidden_layer_sizes': 128, 'm1__learning_... -39.167077 -39.757446 -39.859505 -41.930261 -38.446158 -39.832089 1.163761 6 -38.923256 -39.994593 -40.419534 -40.603182 -38.887307 -39.765574 0.729733 mlp 128 0.001
0 0.009600 0.000360 0.002942 0.000177 NaN NaN NaN {} -42.598838 -41.954426 -42.078697 -43.766672 -42.142113 -42.508149 0.665902 1 -42.470647 -42.599385 -42.603879 -42.229006 -42.580120 -42.496607 0.142358 linear NaN NaN
10 0.324150 0.077325 0.011538 0.000411 0.01 10 100 {'learning_rate': 0.01, 'max_depth': 10, 'n_es... -42.699190 -42.649343 -41.955818 -44.134387 -42.256424 -42.739033 0.748730 9 -41.986935 -42.009507 -42.113968 -41.672861 -42.095984 -41.975851 0.159102 lgb NaN NaN
12 0.296491 0.029635 0.012283 0.001231 0.01 15 100 {'learning_rate': 0.01, 'max_depth': 15, 'n_es... -42.699190 -42.649343 -41.955818 -44.134387 -42.256424 -42.739033 0.748730 9 -41.986935 -42.009507 -42.113968 -41.672861 -42.095984 -41.975851 0.159102 lgb NaN NaN
14 0.287661 0.016971 0.012107 0.000317 0.01 20 100 {'learning_rate': 0.01, 'max_depth': 20, 'n_es... -42.699190 -42.649343 -41.955818 -44.134387 -42.256424 -42.739033 0.748730 9 -41.986935 -42.009507 -42.113968 -41.672861 -42.095984 -41.975851 0.159102 lgb NaN NaN
8 0.228564 0.010983 0.011344 0.000315 0.01 5 100 {'learning_rate': 0.01, 'max_depth': 5, 'n_est... -42.757003 -42.650229 -42.001208 -44.176711 -42.318023 -42.780635 0.746741 12 -42.129717 -42.130339 -42.240985 -41.807522 -42.213291 -42.104371 0.154896 lgb NaN NaN
6 0.572081 0.086572 0.011890 0.000652 0.005 20 100 {'learning_rate': 0.005, 'max_depth': 20, 'n_e... -51.189618 -51.258422 -50.706188 -52.849476 -50.807008 -51.362142 0.773451 13 -51.055151 -51.052018 -51.154014 -50.706460 -51.140228 -51.021574 0.163077 lgb NaN NaN
4 0.251677 0.020732 0.010172 0.000304 0.005 15 100 {'learning_rate': 0.005, 'max_depth': 15, 'n_e... -51.189618 -51.258422 -50.706188 -52.849476 -50.807008 -51.362142 0.773451 13 -51.055151 -51.052018 -51.154014 -50.706460 -51.140228 -51.021574 0.163077 lgb NaN NaN
2 0.258065 0.008950 0.010309 0.000374 0.005 10 100 {'learning_rate': 0.005, 'max_depth': 10, 'n_e... -51.189618 -51.258422 -50.706188 -52.849476 -50.807008 -51.362142 0.773451 13 -51.055151 -51.052018 -51.154014 -50.706460 -51.140228 -51.021574 0.163077 lgb NaN NaN
0 0.256260 0.032656 0.011642 0.001354 0.005 5 100 {'learning_rate': 0.005, 'max_depth': 5, 'n_es... -51.244825 -51.310150 -50.777248 -52.873855 -50.875208 -51.416257 0.757140 16 -51.140001 -51.147479 -51.225999 -50.785174 -51.225740 -51.104879 0.164035 lgb NaN NaN
In [52]:
model = estim_grid.best_model
model
Out[52]:
LGBMRegressor(learning_rate=0.01, max_depth=5, n_estimators=500)
In [53]:
y_test_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_test_pred)
rmse = mean_squared_error(y_test, y_test_pred, squared=False)
print(f"Random Forest Regression test MAE: {mae:.3f}")
print(f"Random Forest Regression test RMSE: {rmse:.3f}")
Random Forest Regression test MAE: 24.990
Random Forest Regression test RMSE: 35.540
In [54]:
# Predictions on "test" dataset # TODO
# Not so interesting for this exercise

Root-cause analysis. Model diagnostics and interpretation

In [61]:
df_plot = pd.DataFrame({"y_test":y_test, "y_test_pred": y_test_pred,'cycle':X_test.cycle})
df_plot = df_plot.sort_values('y_test')
In [62]:
# TODO: plot predicted RUL vs actual RUL
# Order by RUL, should be a nice graph
import numpy as np
import plotly.graph_objects as go

# Create traces
fig = go.Figure()
fig.add_trace(
    go.Scattergl(
        x=df_plot["y_test"],
        y=df_plot["y_test_pred"],
        mode="markers",
        marker=dict(
            color=np.abs(df_plot["y_test"] - df_plot["y_test_pred"]),
            colorscale="Viridis",
            line_width=0,
            opacity=0.7,
            size=6,
        ),
        name="lines",
    )
)
fig.add_shape(type="line", x0=0, y0=0, x1=350, y1=350, line=dict(width=4, dash="dot",color='red'))
fig.update_layout(
    title="Actual vs predicted RUL",
    xaxis_title="Actual RUL",
    yaxis_title="Predicted"
)
fig.show()

Display the predicted vs actual RUL for a couple of engines

In [85]:
y_train_pred = model.predict(df_train_proc2.drop(['RUL','id'],axis=1))
df_plot = pd.DataFrame({"y_train":df_train_proc2.RUL, "y_train_pred": y_train_pred,'id':df_train_proc2.id})

# Create traces
fig = go.Figure()

engine_list  = [96,39,80,71]

for engine in engine_list:
    idx = df_plot.id == engine
    df_plot_filter = df_plot[idx]
    fig.add_trace(
        go.Scattergl(
            x=df_plot_filter["y_train"],
            y=df_plot_filter["y_train_pred"],
            #mode="markers",
            marker=dict(
                color=np.abs(df_plot_filter["y_train"] - df_plot_filter["y_train_pred"]),
                colorscale="Viridis",
                line_width=0,
                opacity=0.7,
                size=6,
            ),
            name=f"Enigne {engine}",
        )
    )
fig.add_shape(type="line", x0=0, y0=0, x1=350, y1=350, line=dict(width=4, dash="dot",color='red'))
fig.update_layout(
    title="Actual vs predicted RUL",
    xaxis_title="Actual RUL",
    yaxis_title="Predicted"
)
fig.show()
In [ ]:
 
In [ ]:
 
In [87]:
# TODO: Shap feature importance ---> FIXME
import shap
data_shap = shap.kmeans(X_train, 50)
explainer = shap.KernelExplainer(model.predict, data=data_shap)
In [88]:
X_sample = X_train.sample(100)
shap_values = explainer.shap_values(X_sample, l1_reg = 'num_features(100)')

In [26]:
# summarize the effects of all the features
shap.summary_plot(shap_values, X_sample)
In [27]:
from pdpbox import pdp, get_dataset, info_plots

features = ['cycle','StaticHPCOutletPres','PhysCoreSpeed','LPCOutletTemp','FuelFlowRatio']

for f in features:
    
    pdp_goals = pdp.pdp_isolate(model = model, 
                                dataset = X_sample, 
                                model_features = X_train.columns,
                                feature = f)
    pdp.pdp_plot(pdp_goals, f, x_quantile=False,  plot_pts_dist=True, plot_lines=True,center=True)

Classification model

Will the engine fail in less than 10 cycles? With what probability?

In [99]:
cycle_threshold = 10
df_train_proc2['RUL_thres'] = np.where(df_train_proc2['RUL'] <= cycle_threshold, 1, 0)
In [29]:
from sklearn.neural_network import MLPClassifier 
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

# Define models
models = {
    #"rf": RandomForestRegressor(), # 3 minutes
    "lgb": lgb.LGBMClassifier(),
    "logistic": LogisticRegression(max_iter=300),
    "mlp": Pipeline([("std",MinMaxScaler()),
                       ("m1",MLPClassifier(early_stopping=True))])
}

# Define hyperparam search space
params = {
    #"rf": {"n_estimators": [100, 500]},
    "lgb": {
        "n_estimators": [100, 500],
        "learning_rate": [0.005, 0.01],
        "max_depth":[5,10,15,20]
    },
    "mlp": {
        "m1__hidden_layer_sizes": [(128),(64, 32)],
        "m1__learning_rate_init": [0.001,0.005,0.01],
    },
    "logistic": {},
}
In [30]:
X_train, X_test, y_train, y_test = prepare_training_data(
    df=df_train_proc2, target_col="RUL_thres", discard_cols=["id","RUL"]
)
In [45]:
%%time
estim_grid_clf = EstimatorSelectionHelper(models, params)
estim_grid_clf.fit(X_train, y_train, scoring="f1", n_jobs=2, cv=None)
Running GridSearchCV for lgb.
Fitting 5 folds for each of 16 candidates, totalling 80 fits
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   14.9s
[Parallel(n_jobs=2)]: Done  80 out of  80 | elapsed:   25.5s finished
Running GridSearchCV for logistic.
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.0s finished
Running GridSearchCV for mlp.
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  30 out of  30 | elapsed:   31.7s finished
CPU times: user 13.4 s, sys: 1.11 s, total: 14.5 s
Wall time: 1min
In [46]:
estim_grid_clf.score_summary().head()
Out[46]:
mean_fit_time std_fit_time mean_score_time std_score_time param_learning_rate param_max_depth param_n_estimators params split0_test_score split1_test_score split2_test_score split3_test_score split4_test_score mean_test_score std_test_score rank_test_score split0_train_score split1_train_score split2_train_score split3_train_score split4_train_score mean_train_score std_train_score model param_m1__hidden_layer_sizes param_m1__learning_rate_init
15 0.773288 0.078503 0.031086 0.003243 0.01 20 500 {'learning_rate': 0.01, 'max_depth': 20, 'n_es... 0.842105 0.836158 0.820809 0.807910 0.821114 0.825619 0.012165 1 0.993612 0.990774 0.994318 0.995733 0.995733 0.994034 0.001825 lgb NaN NaN
9 0.681190 0.077034 0.035963 0.000900 0.01 5 500 {'learning_rate': 0.01, 'max_depth': 5, 'n_est... 0.849711 0.828571 0.821530 0.800000 0.816327 0.823228 0.016248 2 0.923843 0.923735 0.930466 0.932384 0.927350 0.927556 0.003470 lgb NaN NaN
13 0.830603 0.033889 0.031876 0.001241 0.01 15 500 {'learning_rate': 0.01, 'max_depth': 15, 'n_es... 0.835294 0.825843 0.815029 0.804533 0.810651 0.818270 0.010995 3 0.992193 0.990071 0.995025 0.995018 0.996441 0.993750 0.002299 lgb NaN NaN
3 2.492965 0.212066 0.010478 0.002431 NaN NaN NaN {'m1__hidden_layer_sizes': (64, 32), 'm1__lear... 0.823864 0.841499 0.813754 0.803468 0.803519 0.817221 0.014300 1 0.819088 0.815847 0.814224 0.825581 0.824121 0.819772 0.004457 mlp (64, 32) 0.001
11 0.812212 0.019677 0.034056 0.001473 0.01 10 500 {'learning_rate': 0.01, 'max_depth': 10, 'n_es... 0.835294 0.821530 0.811594 0.801136 0.814159 0.816743 0.011347 4 0.985117 0.984353 0.991465 0.985755 0.988604 0.987059 0.002630 lgb NaN NaN
In [47]:
model = estim_grid_clf.best_model
y_test_pred = model.predict(X_test)
f1 = f1_score(y_test, y_test_pred)
print(f"F1 score test: {f1:.3f}")
F1 score test: 0.831
In [35]:
from storpdm.utils import metrics_threshold
from storpdm.build_features import prepare_training_data
from storpdm.visualize import display_roc_pr

y_score = model.predict_proba(X_test)
metrics = metrics_threshold(y_score[:,1], y_test)

display_roc_pr(precision = metrics['prec'],
              recall = metrics['rec'],
              fpr = metrics['fpr'],
              thres = metrics['thres'])
In [37]:
# TODO: display table too
display(
    pd.DataFrame(
        {
            "thresholds": metrics["thres"],
            "precision": metrics["prec"],
            "recall": metrics["rec"],
            "fpr": np.round(metrics["fpr"],3),
            "TP": metrics["tp"],
            "TN": metrics["tn"],
            "FP": metrics["fp"],
            "FN": metrics["fn"],
        }
    )
     .loc[::10]
    .loc[::-1]
)
thresholds precision recall fpr TP TN FP FN
100 1.0 NaN 0.000000 0.000 0 3904 0 223
90 0.9 0.961538 0.448430 0.001 100 3900 4 123
80 0.8 0.928105 0.636771 0.003 142 3893 11 81
70 0.7 0.901639 0.739910 0.005 165 3886 18 58
60 0.6 0.866337 0.784753 0.007 175 3877 27 48
50 0.5 0.836364 0.825112 0.009 184 3868 36 39
40 0.4 0.817021 0.860987 0.011 192 3861 43 31
30 0.3 0.786561 0.892377 0.014 199 3850 54 24
20 0.2 0.743772 0.937220 0.018 209 3832 72 14
10 0.1 0.691558 0.955157 0.024 213 3809 95 10
0 0.0 0.054034 1.000000 1.000 223 0 3904 0

First we define the following metrics:

  • Precision: fraction of predictions were correct
  • False positive rate: fraction of times we predict a failure and then it is not
  • Recall: fraction of failures were predicted. Amongst all failures, how many do we predict correctly?
  • Threshold translates probability of failure to a binary variable.

Looking at the ROC-Precision graph above, we can choose an optimal threshold based on the business needs.

A higher threshold (for example, 0.8) means there fpr is 0.003, 0.928 precision and 0.637 recall. This means we are very confident of our predictions but 0.363% of the failures we miss. This threshold is optimal if the cost

Let's look at the confusion matrix with a threshold of 0.5:

In [39]:
from storpdm.visualize import plot_confusion_matrix
from sklearn.metrics import confusion_matrix

confusionMatrix = confusion_matrix(y_test, y_test_pred)
plot_confusion_matrix(confusionMatrix, normalize = True, classes=["No failure (N)","Failure within 10 days (P)"])
Confusion matrix before normalization
[[3868   36]
 [  39  184]]
Normalized confusion matrix
In [40]:
plot_confusion_matrix(confusionMatrix, normalize = True, classes=["No failure (N)","Failure within 10 days (P)"])
Confusion matrix before normalization
[[3868   36]
 [  39  184]]
Normalized confusion matrix
In [44]:
from plotly.subplots import make_subplots

def display_tp(thres,tp, fp, title="", fig=None, i=0, name1 = "", name2 = ""):

    colors = [
        "#1f77b4",  # muted blue
        "#ff7f0e",  # safety orange
        "#2ca02c",  # cooked asparagus green
        "#d62728",  # brick red
        "#9467bd",  # muted purple
        "#8c564b",  # chestnut brown
        "#e377c2",  # raspberry yogurt pink
        "#7f7f7f",  # middle gray
        "#bcbd22",  # curry yellow-green
        "#17becf",  # blue-teal
    ]
    hovertext = [
        f"TP: {r}<br>FP: {p}<br>Thres: {t:.2f}"
        for r, p, t in zip(tp,fp, thres)
    ]

    # ROC curve Train
    lw = 2
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    opacity=0.8
    
    ## PRecision
    fig.add_trace(
        go.Bar(
            x=thres[::10],
            y=tp[::10],
            name=name1,
            hovertext=hovertext[::10],
            hoverinfo="text",
            marker_color =colors[9],
            opacity=opacity,
            xaxis='x1',
            yaxis='y1'
        ),
        secondary_y=False,
        
    )
    fig.add_trace(
        go.Scatter(
            x=thres,
            y=tp,
            mode="lines",
            name=name1,
            hovertext=hovertext,
            hoverinfo="text",
            line=dict(color=colors[9]),
            showlegend=False,
            xaxis='x1',
            yaxis='y1'
        ),
        secondary_y=False
    )


    ## recall curve
    fig.add_trace(
        go.Bar(
            x=thres[::10],
            y=fp[::10],
#             mode="lines",
            name=name2,
            hovertext=hovertext[::10],
            hoverinfo="text",
            marker_color =colors[4],
            opacity=opacity,
            xaxis='x1',
            yaxis='y1'
        ),
        secondary_y=False
    )
    fig.add_trace(
        go.Scatter(
            x=thres,
            y=fp,
            mode="lines",
            name=name2,
            hovertext=hovertext,
            hoverinfo="text",
            line=dict(color=colors[4]),
            showlegend=False,
            xaxis='x1',
            yaxis='y1'
        ),
        secondary_y=False
    )
        
    
    
    fig.update_yaxes(title_text="")
    fig.update_xaxes(title_text="Limite probabilidad modelo")
    fig.update_yaxes(range=[0, 210])
    fig.update_xaxes(range=[.25, 0.95])
    
    fig.update_layout(height=500)

    return fig
    

fig = display_tp(
    thres=metrics["thres"],
    tp=metrics["tp"],
    fp=metrics["fp"],
    title="",
    fig=None,
    i=0,
    name1 = "Averías detectadas",
    name2 = 'Falsos positivos'
)
#plotly.offline.plot(fig, filename='reports/comparativa_metricas.html')
fig
In [116]:
y_train_pred = estim_grid_clf.best_model.predict_proba(df_train_proc2.drop(['RUL','id','RUL_thres'],axis=1))[:,1]
df_plot = pd.DataFrame({"y_train":df_train_proc2.RUL, "y_train_pred": y_train_pred,'id':df_train_proc2.id})

# Create traces
fig = go.Figure()

engine = 96
idx = df_plot.id == engine
df_plot_filter = df_plot[idx]

fig.add_trace(
    go.Bar(
        x=df_plot_filter["y_train"],
        y=df_plot_filter["y_train_pred"],
        name=f"Enigne {engine}",
    )
)
fig.add_shape(type="line", x0=10, y0=0, x1=10, y1=1, line=dict(width=4, dash="dot",color='red'))
fig.update_layout(
    title=f"Predicted probabilty of failure vs RUL engine id = {engine}",
    xaxis_title="RUL",
    yaxis_title="Predicted probability"
)
fig.show()
In [117]:
fig.show(renderer="notebook")